Getting data into R

Data structures

Why we’re starting here

“Bad programmers worry about the code. Good programmers worry about data structures and their relationships.”

— Linus Torvalds

HTRU2

Pulsar emission profiles from https://arxiv.org/pdf/1603.05166.pdf

The data

Measurements
1. Mean of the integrated profile
2. Standard deviation of the integrated profile
3. Excess kurtosis of the integrated profile
4. Skewness of the integrated profile
5. Mean of the DM-SNR curve
6. Standard deviation of the DM-SNR curve
7. Excess kurtosis of the DM-SNR curve
8. Skewness of the DM-SNR curve
9. True or false pulsar (human-verified)

The data file

Downloaded from: https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip

Where do we go from here?

First tip: work in scripts

Results from running code

What is HTRU2?

# Read in the CSV. col_names = FALSE tells read_csv() that the first row is
# data rather than a header, so placeholder column names are generated.
HTRU2 <- read_csv(
  here("Data_Analyses_MATH_208/Datasets/HTRU2/HTRU_2.csv"),
  col_names = FALSE
)
# Name the nine variables (IP = integrated profile, DMSNR = DM-SNR curve)
names(HTRU2) <- c(
  "Mean_IP", "SD_IP", "EK_IP", "SKW_IP",
  "Mean_DMSNR", "SD_DMSNR", "EK_DMSNR", "SKW_DMSNR",
  "Class"
)
class(HTRU2)
[1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame" 

The basics

The vector

# Sixteen measurements stored together in one numeric (atomic) vector
Lengths_A <- c(
  52, 51, 60, 64, 69, 74, 78, 84,
  86, 96, 104, 112, 118, 125, 132, 135
)
mode(Lengths_A)
[1] "numeric"
Lengths_A[1]
[1] 52
Lengths_A[1] <- 53
Lengths_A[1]
[1] 53

The vector

# Start with a three-element numeric vector
basic <- c(1, 2, 3)
# Assigning past the current end extends the vector; the skipped
# position (4) is filled with NA
basic[5] <- 5
basic
[1]  1  2  3 NA  5

The vector

# A character vector: every element is a string
author_list <- c(
  "J.K. Rowling", "Stephen King", "Michael   Lewis",
  "Toni Morrison", "David McCullough"
)
mode(author_list)
[1] "character"
# A logical vector: every element is TRUE or FALSE
boolean_vec <- c(TRUE, FALSE, TRUE)
mode(boolean_vec)
[1] "logical"

Operators

Lengths_A
 [1]  53  51  60  64  69  74  78  84  86  96 104 112 118 125 132 135
Lengths_A + rep(1,16)
 [1]  54  52  61  65  70  75  79  85  87  97 105 113 119 126 133 136
1:9
[1] 1 2 3 4 5 6 7 8 9

Recycling vectors

Lengths_A / rep(2.54,16)
 [1] 20.86614 20.07874 23.62205 25.19685 27.16535 29.13386 30.70866
 [8] 33.07087 33.85827 37.79528 40.94488 44.09449 46.45669 49.21260
[15] 51.96850 53.14961
Lengths_A / 2.54
 [1] 20.86614 20.07874 23.62205 25.19685 27.16535 29.13386 30.70866
 [8] 33.07087 33.85827 37.79528 40.94488 44.09449 46.45669 49.21260
[15] 51.96850 53.14961
c(1,2,3,4)+c(3,5)
[1] 4 7 6 9

Functions and Methods

Examples of functions

Function name Argument Action
c Vector elements Creates vector
rep times/each/length.out Replicates vector
seq.int from/to/by/length.out/along.with Creates sequence of integers
is.vector Vector/mode Returns TRUE if atomic vector

More examples of functions with differing arguments

sum(c(3,5,NA))
[1] NA
# na.rm = TRUE drops missing values before summing. Spell out TRUE:
# T is an ordinary variable that can be reassigned.
sum(c(3, 5, NA), na.rm = TRUE)
[1] 8
sum(c(3,5,7),c(1,1,1))
[1] 18

Methods

args(mean)
function (x, ...) 
NULL
args(mean.default)
function (x, trim = 0, na.rm = FALSE, ...) 
NULL
methods("mean")
 [1] mean,ANY-method          mean,Matrix-method      
 [3] mean,sparseMatrix-method mean,sparseVector-method
 [5] mean.Date                mean.default            
 [7] mean.difftime            mean.IDate*             
 [9] mean.POSIXct             mean.POSIXlt            
[11] mean.quosure*           
see '?methods' for accessing help and source code

Moving beyond atomic vectors

Generic vectors

# Build a 3 x 3 matrix from the integers 1 through 9; values fill
# column by column
X <- matrix(data = 1:9, nrow = 3, ncol = 3)
X
     [,1] [,2] [,3]
[1,]    1    4    7
[2,]    2    5    8
[3,]    3    6    9
class(X)
[1] "matrix"
mode(X)
[1] "numeric"

Generic vectors

attributes(X)
$dim
[1] 3 3
dim(X)
[1] 3 3

Subscripting matrices

X
     [,1] [,2] [,3]
[1,]    1    4    7
[2,]    2    5    8
[3,]    3    6    9
X[2,1]
[1] 2
X[4]
[1] 4

Operators for matrices

X %*% t(X)
     [,1] [,2] [,3]
[1,]   66   78   90
[2,]   78   93  108
[3,]   90  108  126
X * t(X)
     [,1] [,2] [,3]
[1,]    1    8   21
[2,]    8   25   48
[3,]   21   48   81

Lists

List examples

# Course numbers grouped by program year (U1, U2, U3)
U1 <- c(203, 204)
U2 <- c(323, 324, 447)
U3 <- c(208, 427, 423, 523, 545)

# A list can hold elements of different lengths and types: here, three
# numeric vectors plus a character string
mymcgill_stats <- list(U1, U2, U3, "Statistics Major")

mymcgill_stats
[[1]]
[1] 203 204

[[2]]
[1] 323 324 447

[[3]]
[1] 208 427 423 523 545

[[4]]
[1] "Statistics Major"

Naming elements and subscripting

# Name each element so it can be retrieved by name as well as by position
mymcgill_stats <- list(U1 = U1, U2 = U2, U3 = U3, Major = "Statistics Major")

mymcgill_stats
$U1
[1] 203 204

$U2
[1] 323 324 447

$U3
[1] 208 427 423 523 545

$Major
[1] "Statistics Major"
mymcgill_stats[["U2"]]
[1] 323 324 447

Comparing [[]], [], $

# Named list holding only the three course vectors
mymcgill_stats <- list(U1 = U1, U2 = U2, U3 = U3)

mymcgill_stats$U2
[1] 323 324 447
mymcgill_stats[["U2"]]
[1] 323 324 447
mymcgill_stats["U2"]
$U2
[1] 323 324 447

Comparing [[]], [], $ (cont.)

mymcgill_stats[c(2,3)]
$U2
[1] 323 324 447

$U3
[1] 208 427 423 523 545
mymcgill_stats[c("U1","U3")]
$U1
[1] 203 204

$U3
[1] 208 427 423 523 545

Comparing [[]], [], $ (cont.)

mymcgill_stats[[c(1,2)]] # Recursive indexing 1st of outer, 2nd of inner
[1] 204
mymcgill_stats[[1]][2] # Access vector, access 2nd element
[1] 204
mymcgill_stats[[1]][[2]] # Access vector, access 2nd element
[1] 204
mymcgill_stats[1]$U1[2] # Access list, access U1, access 2nd element
[1] 204

How do we think about datasets usually?

Like this

data.frame

# Base R reader: returns a plain data.frame. header = FALSE treats the
# first row as data, so columns get the default names V1, V2, ...
htru2_df <- read.csv(
  here("Data_Analyses_MATH_208/Datasets/HTRU2/HTRU_2.csv"),
  header = FALSE
)
class(htru2_df)
[1] "data.frame"
head(htru2_df)
         V1       V2          V3         V4       V5       V6        V7
1 140.56250 55.68378 -0.23457141 -0.6996484 3.199833 19.11043  7.975532
2 102.50781 58.88243  0.46531815 -0.5150879 1.677258 14.86015 10.576487
3 103.01562 39.34165  0.32332837  1.0511644 3.121237 21.74467  7.735822
4 136.75000 57.17845 -0.06841464 -0.6362384 3.642977 20.95928  6.896499
5  88.72656 40.67223  0.60086608  1.1234917 1.178930 11.46872 14.269573
6  93.57031 46.69811  0.53190485  0.4167211 1.636288 14.54507 10.621748
         V8 V9
1  74.24222  0
2 127.39358  0
3  63.17191  0
4  53.59366  0
5 252.56731  0
6 131.39400  0

tibble

library(tidyverse)
# readr reader: returns a tibble. col_names = FALSE treats the first row
# as data, so columns get the default names X1, X2, ...
htru2_tbl <- read_csv(
  here("Data_Analyses_MATH_208/Datasets/HTRU2/HTRU_2.csv"),
  col_names = FALSE
)
class(htru2_tbl)
[1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame" 

tibble vs. data.frame

htru2_tbl
# A tibble: 17,898 x 9
      X1    X2      X3     X4    X5    X6    X7    X8    X9
   <dbl> <dbl>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1 141.   55.7 -0.235  -0.700 3.20  19.1   7.98  74.2     0
 2 103.   58.9  0.465  -0.515 1.68  14.9  10.6  127.      0
 3 103.   39.3  0.323   1.05  3.12  21.7   7.74  63.2     0
 4 137.   57.2 -0.0684 -0.636 3.64  21.0   6.90  53.6     0
 5  88.7  40.7  0.601   1.12  1.18  11.5  14.3  253.      0
 6  93.6  46.7  0.532   0.417 1.64  14.5  10.6  131.      0
 7 119.   48.8  0.0315 -0.112 0.999  9.28 19.2  480.      0
 8 130.   39.8 -0.158   0.390 1.22  14.4  13.5  198.      0
 9 107.   52.6  0.453   0.170 2.33  14.5   9.00 108.      0
10 107.   39.5  0.466   1.16  4.08  25.0   7.40  57.8     0
# … with 17,888 more rows

tibble vs. data.frame (cont.)

head(as.data.frame(htru2_tbl))
         X1       X2          X3         X4       X5       X6        X7
1 140.56250 55.68378 -0.23457141 -0.6996484 3.199833 19.11043  7.975532
2 102.50781 58.88243  0.46531815 -0.5150879 1.677258 14.86015 10.576487
3 103.01562 39.34165  0.32332837  1.0511644 3.121237 21.74467  7.735822
4 136.75000 57.17845 -0.06841464 -0.6362384 3.642977 20.95928  6.896499
5  88.72656 40.67223  0.60086608  1.1234917 1.178930 11.46872 14.269573
6  93.57031 46.69811  0.53190485  0.4167211 1.636288 14.54507 10.621748
         X8 X9
1  74.24222  0
2 127.39358  0
3  63.17191  0
4  53.59366  0
5 252.56731  0
6 131.39400  0

tibble vs. data.frame, Round 2

# A tibble column can itself be a list, so each row of Courses holds a
# numeric vector of a different length
mymcgill_stats_tbl <- tibble(
  Courses = list(U1 = U1, U2 = U2, U3 = U3),
  Year = c("U1", "U2", "U3"),
  Major = rep("Statistics Major", 3)
)
mymcgill_stats_tbl
# A tibble: 3 x 3
  Courses   Year  Major           
  <list>    <chr> <chr>           
1 <dbl [2]> U1    Statistics Major
2 <dbl [3]> U2    Statistics Major
3 <dbl [5]> U3    Statistics Major

Back to HTRU2

names(attributes(htru2_tbl))
[1] "names"     "class"     "row.names" "spec"     
attributes(htru2_tbl)$names
[1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9"
# Replace the default X1..X9 column names with descriptive ones
# (IP = integrated profile, DMSNR = DM-SNR curve)
names(htru2_tbl) <- c(
  "Mean_IP", "SD_IP", "EK_IP", "SKW_IP",
  "Mean_DMSNR", "SD_DMSNR", "EK_DMSNR", "SKW_DMSNR",
  "Class"
)
htru2_tbl
# A tibble: 17,898 x 9
   Mean_IP SD_IP   EK_IP SKW_IP Mean_DMSNR SD_DMSNR EK_DMSNR SKW_DMSNR
     <dbl> <dbl>   <dbl>  <dbl>      <dbl>    <dbl>    <dbl>     <dbl>
 1   141.   55.7 -0.235  -0.700      3.20     19.1      7.98      74.2
 2   103.   58.9  0.465  -0.515      1.68     14.9     10.6      127. 
 3   103.   39.3  0.323   1.05       3.12     21.7      7.74      63.2
 4   137.   57.2 -0.0684 -0.636      3.64     21.0      6.90      53.6
 5    88.7  40.7  0.601   1.12       1.18     11.5     14.3      253. 
 6    93.6  46.7  0.532   0.417      1.64     14.5     10.6      131. 
 7   119.   48.8  0.0315 -0.112      0.999     9.28    19.2      480. 
 8   130.   39.8 -0.158   0.390      1.22     14.4     13.5      198. 
 9   107.   52.6  0.453   0.170      2.33     14.5      9.00     108. 
10   107.   39.5  0.466   1.16       4.08     25.0      7.40      57.8
# … with 17,888 more rows, and 1 more variable: Class <dbl>